Sankey-Alluvial-Transitions

Adam Garber

9/2/2020

library(MplusAutomation)
library(tidyverse)       
library(here)            
library(gt) 

# sankey / alluvial packages
library(networkD3)
library(parcats)
library(easyalluvial)
library(ggalluvial)

Testing Sankey & Alluvial Daigram Variants


LTA context issues:

  1. The parameters typically used is in LTA plots are the transition probabilities (think the stratum b/w categorical variables) & class sizes (proportions in each category).
  2. Longitudinal model: 1st nominal variable (left) is time-1 with 4-classes and 2nd variable (right) is for time-2 with 4-classes
  3. Missing from these plots is the time-1 class sizes (currently each class is incorrectly 25%). I am not sure how to incorporate the true class size information.

Read invariance model and extract parameters (intercepts and multinomial regression coefficients)

lta_inv1 <- readModels(here("4-Class-Invariant.out" ), quiet = TRUE)
## <simpleError in file(file, "rt"): cannot open the connection>
par <- as_tibble(lta_inv1[["parameters"]][["unstandardized"]]) %>% 
select(1:3) %>% 
  filter(grepl('ON|Means', paramHeader)) %>% 
  mutate(est = as.numeric(est))

Manual method to calculate transition probabilities

# Name each parameter individually to make the subsequent calculations more readable
a1 <- unlist(par[13,3]); a2 <- unlist(par[14,3]); a3 <- unlist(par[15,3]); b11 <- unlist(par[1,3]);
b21 <- unlist(par[4,3]); b31 <- unlist(par[7,3]); b12 <- unlist(par[2,3]); b22 <- unlist(par[5,3]);
b32 <- unlist(par[8,3]); b13 <- unlist(par[3,3]); b23 <- unlist(par[6,3]); b33 <- unlist(par[9,3])

# Calculate transition probabilities from the logit parameters
t11 <- exp(a1+b11)/(exp(a1+b11)+exp(a2+b21)+exp(a3+b31)+exp(0))
t12 <- exp(a2+b21)/(exp(a1+b11)+exp(a2+b21)+exp(a3+b31)+exp(0))
t13 <- exp(a3+b31)/(exp(a1+b11)+exp(a2+b21)+exp(a3+b31)+exp(0))
t14 <- 1 - (t11 + t12 + t13)

t21 <- exp(a1+b12)/(exp(a1+b12)+exp(a2+b22)+exp(a3+b32)+exp(0))
t22 <- exp(a2+b22)/(exp(a1+b12)+exp(a2+b22)+exp(a3+b32)+exp(0))
t23 <- exp(a3+b32)/(exp(a1+b12)+exp(a2+b22)+exp(a3+b32)+exp(0))
t24 <- 1 - (t21 + t22 + t23)

t31 <- exp(a1+b13)/(exp(a1+b13)+exp(a2+b23)+exp(a3+b33)+exp(0))
t32 <- exp(a2+b23)/(exp(a1+b13)+exp(a2+b23)+exp(a3+b33)+exp(0))
t33 <- exp(a3+b33)/(exp(a1+b13)+exp(a2+b23)+exp(a3+b33)+exp(0))
t34 <- 1 - (t31 + t32 + t33)

t41 <- exp(a1)/(exp(a1)+exp(a2)+exp(a3)+exp(0))
t42 <- exp(a2)/(exp(a1)+exp(a2)+exp(a3)+exp(0))
t43 <- exp(a3)/(exp(a1)+exp(a2)+exp(a3)+exp(0))
t44 <- 1 - (t41 + t42 + t43)

Create transition table

t_matrix <- tibble(
  "Time1" = c("C1=1","C1=2","C1=3","C1=4"),
  "C2=1" = c(t11,t21,t31,t41),
  "C2=2" = c(t12,t22,t32,t42),
  "C2=3" = c(t13,t23,t33,t43),
  "C2=4" = c(t14,t24,t34,t44))

t_matrix %>% 
  gt(rowname_col = "Time1") %>% 
  tab_header(
    title = md("**Student transitions from 7th grade (rows) to 10th grade (columns)**"),
    subtitle = md("&nbsp;")) %>% 
  fmt_number(2:5,decimals = 2) %>% 
  tab_spanner(label = "10th grade",columns = 2:5)
Student transitions from 7th grade (rows) to 10th grade (columns)
 
10th grade
C2=1 C2=2 C2=3 C2=4
C1=1 0.27 0.27 0.32 0.15
C1=2 0.09 0.56 0.19 0.16
C1=3 0.15 0.21 0.52 0.12
C1=4 0.08 0.35 0.27 0.30

Plot LTA transitions


Type 1: LTA plot using plotLTA from the package {MplusAutomation}

  • issue1: node proportions & transition proportions information is missing (these need to be labeled!)
  • issue2: change to faceted plot
MplusAutomation::plotLTA(lta_inv1)

Type 2: Sankey interactive diagram using the package {networkD3}

  • issue 1: Proportions at time 1 are incorrect. Unclear how to incorporate initial class size data into chart.
  • issue 2: Unclear how to label node and transition. Examples to work from are unavailable.

Change to long-format

trans_long <- t_matrix %>% 
  pivot_longer(`C2=1`:`C2=4`, # The columns I'm gathering together
           names_to = "Time2", # new column name for existing names
         values_to = "value") # new column name to store values

nodes <- data.frame(name=c(as.character(trans_long$Time1),
                           as.character(trans_long$Time2)) %>%
                      unique())

trans_long$IDTime1=match(trans_long$Time1, nodes$name)-1 
trans_long$IDTime2=match(trans_long$Time2, nodes$name)-1

Prepare color scale

ColourScal ='d3.scaleOrdinal().range([
"#FDE725FF","#B4DE2CFF","#6DCD59FF","#35B779FF","#1F9E89FF",
"#26828EFF","#31688EFF","#3E4A89FF","#482878FF","#440154FF"])'

It’s interactive!

sankeyNetwork(Links = trans_long, Nodes = nodes,
              Source = "IDTime1", Target = "IDTime2",
              Value = "value", NodeID = "name", 
              sinksRight=FALSE, colourScale=ColourScal,
              nodeWidth=40, fontSize=13, nodePadding=20)

Type 3: LTA alluvial plot using the package {ggalluvial}

  • issue 1: Proportions at time 1 are incorrect. Unclear how to incorporate initial class size data into chart.
  • issue 2: Unclear how to label node and transitions in a clear manner. Examples to work from are unavailable.
#library(gghighlight)

ggplot(trans_long,
       aes(axis1 = Time1, axis2 = Time2,
           y = value)) +
  scale_x_discrete(limits = c("7th Grade", "10th Grade"), expand = c(.2, .05)) +
  geom_alluvium(aes(fill = Time1), show.legend = FALSE) +
  geom_stratum() +
  #geom_text(stat = "alluvium", aes(label = round(value,2), color = Time1),
  #          show.legend = FALSE, fontface = 2, position = position_nudge(x = -0.21)) +
  theme_minimal()

  # + gghighlight(label_key = value)

Type 4: A cool interactive alluvial plot using the package {easyalluvial}

Currently this plot makes little sense, it was just the first version I could get to run. More exploration needed!

library(parcats)
library(easyalluvial)

p = alluvial_wide(t_matrix, max_variables = 5)

parcats(p, marginal_histograms = TRUE, data_input = t_matrix)

Cite all packages

#library(grateful)
cite_packages()

Second attempt: go the ggplot route (rather than rely on a package)


Things I like about this solution:

  • Faceted plot: The focus in LTA is on the transitions therefore having all 16 transitions superimposed is not ideal
  • Labeled transitions & nodes: This info is critical to my application

Things I DON’T like:

  • I prefer the Alluvial/Sankey style of having the discrete categories represented by a bar (left/right side of plots) cut into pieces. This makes sense in the LTA because the whole sample (\(P=1\)) is proportioned into classes. The nodes don’t capture this part/whole relationship as clearly.

library(PNWColors)
library(ggrepel)
library(glue)

load the model

mplusModel <- readModels(filefilter = "sci_attitude", quiet = TRUE)
## <simpleError in file(file, "rt"): cannot open the connection>

This code is derived from the plotLTA function found with the source code from the MplusAutomation package

  • Since the code I started with was written for a package it still contains remnants of code related to the function arguments.
  • Sorry about the mess!

Source code: https://github.com/michaelhallquist/MplusAutomation/blob/995d1ecfae3656524153456ce647f86fe8c1cf1e/R/mixtures.R


Function arguments for plotLTA

# set the thickness of the node (circle outlines)
node_stroke = 2

# set the maximum thickness of the edges (lines)
max_edge_width = 2

Extract edge data

edges <- mplusModel$class_counts$transitionProbs
# create vector with all classes 
all_classes <- unique(c(edges$from, edges$to))

# create vector of latent variable names only
latent_variables <- unique(gsub("\\..+$", "", all_classes))

# create 4 new columns a start and end for each axis of plot
edges$x <- as.numeric(factor(gsub("\\..+$", "", as.character(edges$from)), levels = latent_variables))

edges$xend <-as.numeric(factor(gsub("\\..+$", "", as.character(edges$to)), levels = latent_variables))

edges$y <- as.numeric(gsub("^.+\\.", "", as.character(edges$from)))

edges$yend <- as.numeric(gsub("^.+\\.", "", as.character(edges$to)))

# transition labels:
# 1. create column of transitions in percent scale
# 2. calculate label positions = y-value of segment midpoints
edges <-  edges %>% 
  mutate(trans_perc = round(probability*100,0)) %>% 
  mutate(y_mid = y + ((yend - y)/2)) 

Create a nodes data.frame

nodes <- rbind(edges[, c(1, 4, 6)], setNames(edges[, c(2, 5, 7)], names(edges)[c(1, 4, 6)]))

nodes <- nodes[!duplicated(nodes),]

names(nodes)[1] <- "nodeID"

Create a n_prop data.frame containing class count/proportions used to set edge & node thickness.

# Extract counts and proportions
n_prop <- mplusModel$class_counts$modelEstimated %>% 
  mutate(percent = round(proportion*100,0))

# Convert 'proportions' column to stroke thickness values
if (!is.null(node_stroke)) {
  n_prop$proportion <-
    node_stroke * n_prop$proportion * inverse.rle(list(
      lengths = rle(n_prop$variable)$lengths,
      values = rle(n_prop$variable)$lengths
    ))
} else {
  n_prop$proportion <- 1
}

# Add column nodeID
n_prop$nodeID <- paste(n_prop$variable, n_prop$class, sep = ".")

# merge data.frames 'nprop' and 'nodes'
nodes <- merge(nodes, n_prop, by = "nodeID") 

# ------------------------------------------
# FAILED ATTEMPTS AT RE-ORDERING CLASSES 
# (classes are nominal but plot dimensions are treated as continuous) 
# ------------------------------------------
# re-order nominal classes based on researcher preference
# nodes <- nodes %>%
#  mutate(y = case_when(
#    y ==  2 ~ 1,
#    y ==  1 ~ 2,
#    y ==  3 ~ 3,
#    y ==  4 ~ 4
#  ))
# nodes <- nodes %>%
#   mutate(y = factor(y,
#         levels = c("2","1","3","4"), 
#         labels = c("C1","C2","C3","C4")))
# ------------------------------------------

Set node size value

# original value from function = 3.880556
nodesize <- max(max(sapply(nodes$nodeID, nchar)) * 4.5, 6)

Make the plot with ggplot in a step-wise fashion


  1. start with blank slate
p0 <- ggplot(NULL)

p0


  1. Create the edges with geom_segment()
p1 <- p0 + geom_segment(
  data = edges,
  aes(
    x = x,
    y = y,
    xend = xend,
    yend = yend,
    size = probability,
    alpha = probability),
    color = "black") +
  scale_x_continuous(breaks = c(1,2), labels = c('1' = "7th Grade", '2' = "10th Grade")) +
  scale_alpha_continuous(range = c(.05,.8))

p1

This is too much going on with all 16 overlapping transitions


  1. Facet wrap by class + some extra formatting:
  1. re-label the facets manually
  2. re-label the x-axis
  3. expand grid to make room for large nodes
  4. reverse y-axis scale so ‘4’ is at the top
  5. add the transition label layer to the plot
p2 <- p1 +
  facet_wrap(~y, labeller = labeller(
  y = c(`1` = "Transitions from K = 1 in 7th grade", `2` = "Transitions from K = 2 in 7th grade",
        `3` = "Transitions from K = 3 in 7th grade", `4` = "Transitions from K = 4 in 7th grade"))) +
  scale_x_continuous(expand = c(.1, .1), breaks = c(1,2), labels = c('1' = "7th Grade", '2' = "10th Grade")) + 
  scale_y_reverse(expand = c(.15, .15)) +
  geom_text_repel(data=edges, aes(x=xend, y=y_mid, label = glue("{trans_perc}%")), 
                   segment.colour = NA, 
                   nudge_x = -.5
                   )

p2

  1. Add nodes (first attempt)
p3 <- p2 + geom_point(
     data = nodes,
     shape = 21,
     size = nodesize,
     colour = "black",
     fill = "white",
     aes(x = x, y = y, stroke = proportion)
  )

p3 

This is not what I want I need all 8 nodes to be printed within each facet


  1. Add nodes (second attempt)

To include all 8 nodes in each facet a data.frame was used nodes2 in which the faceting variable y is absent.

nodes2 <- nodes %>% rename(x1 = x, y1 = y) %>% 
  mutate(class = factor(class))

p4 <- p2 + geom_point(
     data = nodes2,
     shape = 21,
     size = nodesize,
     fill = "white",
     aes_string(x = "x1", y = "y1", stroke = "proportion")
  ) 

p4 

  1. Add in node labels
p5 <- p4 +
  geom_text(data = nodes2, aes(x = x1, y = y1, label = paste0(percent,"%"))) +
  theme_void() + 
  theme(axis.text.x = element_text())

p5

Save BW transition plot

ggsave(here("figures","bw_transition_plot.png"),                           
       dpi=500, height=5, width=8, units="in")                 

Create color version of figure


pallete=pnw_palette("Bay", n=4, type = "discrete")

p6 <- p3 + geom_point(
     data = nodes2,
     shape = 21,
     size = nodesize,
     color = "black",
     aes_string(x = "x1", y = "y1", stroke = "proportion", fill = "class"), show.legend = FALSE) +
  scale_fill_manual("", values = rev(pallete)) +
  geom_text(data = nodes2, aes(x = x1, y = y1, label = paste0(percent,"%"))) +
  theme_void()

p6 

Save color transition plot

ggsave(here("figures","color_transition_plot.png"),                           
       dpi=500, height=5, width=8, units="in")